Using clustered SEs and fixed effects

Violating non-independence

Consider a (contrived) scenario where you just accidentally include a bunch of duplicated responses:

# Simulate N independent observations, then duplicate the whole
# data frame 100 times to create artificial non-independence.
set.seed(500)

N <- 500
ID <- seq(N)              # an id for each observation
X <- rnorm(N)             # the IV
Y <- 1 + rnorm(N, 0, 10)  # the DV

df <- data.frame(X, Y, ID)

# stack 100 identical copies of df on top of each other
df_duplicated <- replicate(100, df, simplify = FALSE) |>
  bind_rows()

model <- lm(Y ~ X, data = df)
model_duplicated <- lm(Y ~ X, data = df_duplicated)

Violating non-independence

Obviously, the second model is just vastly overestimating our real sample size. We may have 50,000 rows in our dataset, but there are only 500 independent observations here.

# Side-by-side table: identical point estimates, but the
# duplicated model's SEs are deflated by the fake sample size.
mlist_dup <- list(model, model_duplicated)
modelsummary(mlist_dup)
tinytable_7iw18d0ovu1rve3zzvob
(1) (2)
(Intercept) 0.468 0.468
(0.440) (0.044)
X -0.264 -0.264
(0.433) (0.043)
Num.Obs. 500 50000
R2 0.001 0.001
R2 Adj. -0.001 0.001
AIC 3708.0 370209.8
BIC 3720.7 370236.2
Log.Lik. -1851.019 -185101.893
F 0.374 37.502
RMSE 9.81 9.81

Clustering error terms

If we know the source of the clustering, we can adjust our standard errors to account for it. Notice that we’re able to approximately get the correct standard error size here using the duplicated data:

library(sandwich)
library(lmtest)

# Cluster-robust (HC2) standard errors, clustered on the original
# observation id so each set of duplicates counts as one unit.
model1_robust <- coeftest(
  model_duplicated,
  vcov = vcovCL,
  type = "HC2",
  cluster = ~ID
)
model1_robust

t test of coefficients:

            Estimate Std. Error t value Pr(>|t|)
(Intercept)  0.46843    0.44167  1.0606   0.2889
X           -0.26447    0.43473 -0.6084   0.5430

Or, we can estimate bootstrapped standard errors:

# Cluster bootstrap standard errors ("xy" = pairs/case resampling
# of whole clusters).
model1_robust_boot <- coeftest(
  model_duplicated,
  vcov = vcovBS(model_duplicated, cluster = ~ID, type = "xy")
)

model1_robust_boot

t test of coefficients:

            Estimate Std. Error t value Pr(>|t|)
(Intercept)  0.46843    0.44393  1.0552   0.2913
X           -0.26447    0.41921 -0.6309   0.5281

We can put these all together in a shared table to compare results:

# Collect all four specifications for a single comparison table.
mlist <- list(
  "original" = model,
  "duplicates" = model_duplicated,
  "duplicates + cluster robust se" = model1_robust,
  "duplicates + cluster bootstrap" = model1_robust_boot
)

modelsummary(
  mlist,
  estimate = "{estimate}",
  statistic = c("conf.int", "std.error"),
  conf_level = 0.95,
  # the argument is `notes` (plural); `note` only worked through
  # R's partial argument matching
  notes = "note: Standard errors in parentheses, 95% ci in brackets",
  gof_map = c("nobs")
)
tinytable_al0yrqa3fs3e7n2d7qrm
original duplicates duplicates + cluster robust se duplicates + cluster bootstrap
note: Standard errors in parentheses, 95% ci in brackets
(Intercept) 0.468 0.468 0.468 0.468
[-0.396, 1.333] [0.382, 0.554] [-0.397, 1.334] [-0.402, 1.339]
(0.440) (0.044) (0.442) (0.444)
X -0.264 -0.264 -0.264 -0.264
[-1.115, 0.586] [-0.349, -0.180] [-1.117, 0.588] [-1.086, 0.557]
(0.433) (0.043) (0.435) (0.419)
Num.Obs. 500 50000 50000 50000

Alternatively, the modelsummary package can calculate standard errors “on the fly” if we specify values for vcov arguments:

# modelsummary can also compute cluster-robust SEs on the fly via
# the `vcov` argument.
modelsummary(
  model_duplicated,
  vcov = \(x) vcovCL(x, cluster = ~ID)
)
tinytable_qosss7257szl0s5v00wu
(1)
(Intercept) 0.468
(0.441)
X -0.264
(0.434)
Num.Obs. 50000
R2 0.001
R2 Adj. 0.001
AIC 370209.8
BIC 370236.2
Log.Lik. -185101.893
RMSE 9.81
Std.Errors Custom
# Same comparison, but let modelsummary apply a different vcov to
# each model (the last two entries reuse the duplicated model and
# differ only in how the SEs are computed).
mlist <- list(
  "original" = model,
  "duplicates" = model_duplicated,
  "duplicates + cluster robust se" = model_duplicated,
  "duplicates + cluster bootstrap" = model_duplicated
)

modelsummary(
  mlist,
  estimate = "{estimate}",
  statistic = c("conf.int", "std.error"),
  conf_level = 0.95,
  # the argument is `notes` (plural); `note` only worked through
  # R's partial argument matching
  notes = "note: Standard errors in parentheses. 95% ci in brackets",
  gof_omit = "F|RMSE|R2$|AIC|Log.Lik.",
  # one vcov per model, in order
  vcov = c(
    "classical",
    "classical",
    function(x) vcovCL(x, cluster = ~ID),
    function(x) vcovBS(x, cluster = ~ID)
  )
)
tinytable_0a6egtfeylrw7ix89fvg
original duplicates duplicates + cluster robust se duplicates + cluster bootstrap
note: Standard errors in parentheses. 95% ci in brackets
(Intercept) 0.468 0.468 0.468 0.468
[-0.396, 1.333] [0.382, 0.554] [-0.396, 1.333] [-0.425, 1.362]
(0.440) (0.044) (0.441) (0.456)
X -0.264 -0.264 -0.264 -0.264
[-1.115, 0.586] [-0.349, -0.180] [-1.114, 0.585] [-1.147, 0.618]
(0.433) (0.043) (0.434) (0.450)
Num.Obs. 500 50000 50000 50000
R2 Adj. -0.001 0.001 0.001 0.001
BIC 3720.7 370236.2 370236.2 370236.2
Std.Errors IID IID Custom Custom

Also note that, in the absence of real clustering, there’s not much that changes here:

set.seed(100)

# Placebo clusters: random letters carry no real within-cluster
# dependence, so clustering should barely change the SEs.
fakeids <- sample(letters, size = nrow(model$model), replace = TRUE)

model0_robust <- coeftest(
  model,
  vcov = vcovCL,
  cluster = ~fakeids
)
model0_robust

t test of coefficients:

            Estimate Std. Error t value Pr(>|t|)
(Intercept)  0.46843    0.45133  1.0379   0.2998
X           -0.26447    0.50125 -0.5276   0.5980

Fixed effects

We’ll bring in the county-level election results data along with some demographic and income measures to estimate a fixed effects model.

library(tidycensus)

# county level election results
counties_24 <- read_csv("https://raw.githubusercontent.com/tonmcg/US_County_Level_Election_Results_08-24/refs/heads/master/2024_US_County_Level_Presidential_Results.csv")

# ACS county-level population, income, and race/ethnicity counts
county_data <- get_acs(
  geography = "county",
  variables = c(
    Pop = "B02001_001",
    Income = "B19013_001",
    White = "B03002_003",
    AfAm = "B03002_004",
    Hisp = "B03002_012",
    Asian = "B03002_006"
  )
)

# one row per county, one column per ACS variable
county_data_wide <- county_data |>
  select(-moe) |>
  pivot_wider(names_from = variable, values_from = estimate)

# merge on FIPS code, then rescale: shares as percentages,
# income in thousands of dollars
counties <- left_join(
  counties_24,
  county_data_wide,
  by = join_by(county_fips == GEOID)
) |>
  mutate(
    perc_gop = per_gop * 100,
    Income = Income / 1000,
    White = (White / Pop) * 100,
    Hisp = (Hisp / Pop) * 100,
    AfAm = (AfAm / Pop) * 100
  )

Fixed Effects

I can use a regular linear regression here to estimate the fixed effects model, but it gives messy results and it's slow. So instead I'll use the fixest package:

fmodel<-feols(perc_gop ~ Income + White + AfAm + Hisp  | state_name, data=counties)

(by default, the feols function also clusters the standard errors on the level of the fixed effect variable)

I can access the fixed effects portion of the model with the fixef function:

fixef(fmodel)
$state_name
             Alabama               Alaska              Arizona 
          48.9672144           47.9685659           36.6186736 
            Arkansas           California             Colorado 
          43.1433168           29.8434350           26.5583549 
         Connecticut             Delaware District of Columbia 
          24.5050361           30.5157939           17.3861555 
             Florida              Georgia               Hawaii 
          42.8727352           49.4820748           41.5433744 
               Idaho             Illinois              Indiana 
          39.8121580           32.7286782           33.9488808 
                Iowa               Kansas             Kentucky 
          29.9427774           40.4630477           36.6899277 
           Louisiana                Maine             Maryland 
          51.2184782           11.5335440           36.9773900 
       Massachusetts             Michigan            Minnesota 
          13.2613462           25.4809706           28.6849395 
         Mississippi             Missouri              Montana 
          49.2203145           38.6727622           34.9826863 
            Nebraska               Nevada        New Hampshire 
          42.5076096           44.2297892           15.2354715 
          New Jersey           New Mexico             New York 
          35.3871503           33.2465147           25.5410508 
      North Carolina         North Dakota                 Ohio 
          38.1317567           41.4730634           33.6453619 
            Oklahoma               Oregon         Pennsylvania 
          50.7526807           24.9446873           30.5665041 
        Rhode Island       South Carolina         South Dakota 
          14.3141242           44.8372734           38.3548022 
           Tennessee                Texas                 Utah 
          42.3411461           52.6338998           40.5997769 
             Vermont             Virginia           Washington 
           0.5335823           37.0057921           24.0827078 
       West Virginia            Wisconsin              Wyoming 
          34.3678128           22.8105510           42.7391033 

attr(,"class")
[1] "fixest.fixef" "list"        
attr(,"exponential")
[1] FALSE

For comparison, I’ll also run a model that doesn’t include state-level fixed effects (I’ll still use feols because the modelsummary package will give some extra information when it has the same types of models)

# Pooled model without state fixed effects. SEs are clustered by
# state so both models use comparable inference. The explicit
# named argument `cluster = ~state_name` replaces the ambiguous
# positional two-sided formula `cluster ~ state_name`, which was
# silently being matched to feols's third (vcov) argument.
regmodel <- feols(
  perc_gop ~ Income + White + AfAm + Hisp,
  data = counties,
  cluster = ~state_name
)

mlist <- list(
  "No fixed effects" = regmodel,
  "Fixed effects" = fmodel
)

# human-readable coefficient labels for the table
coefnames <- c(
  "Income" = "Median income ($1000)",
  "White" = "% White",
  "Hisp" = "% Hispanic",
  "AfAm" = "% African American"
)

modelsummary(
  mlist,
  estimate = "{estimate}",
  statistic = c("conf.int", "std.error"),
  conf_level = 0.95,
  coef_rename = coefnames,
  coef_omit = "Intercept",
  gof_map = c("nobs", "aic", "bic", "vcov.type", "FE: state_name")
)
tinytable_3a7fo6hpy83uveirdcqp
No fixed effects Fixed effects
Median income ($1000) -0.374 -0.257
[-0.444, -0.304] [-0.308, -0.206]
(0.035) (0.025)
% White 0.574 0.592
[0.399, 0.748] [0.525, 0.659]
(0.087) (0.034)
% African American -0.047 -0.249
[-0.247, 0.153] [-0.362, -0.135]
(0.100) (0.057)
% Hispanic 0.427 0.275
[0.181, 0.673] [0.152, 0.398]
(0.122) (0.061)
Num.Obs. 3115 3115
AIC 24166.5 22175.2
BIC 24196.7 22507.6
Std.Errors by: state_name by: state_name
FE: state_name X

We can also plot the coefficients. In this example, I’ll standardize the variables so we can compare coefficient values that have different scales:

# Coefficient plot with "refit"-standardized coefficients so that
# effects measured on different scales are directly comparable.
modelplot(
  mlist,
  coef_omit = "Intercept",
  standardize = "refit"
) +
  geom_vline(xintercept = 0, lty = 2) +
  labs(title = "Standardized coefficients")